{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding:utf8 -*-\n",
    "from sklearn.externals import joblib\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import os\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn import metrics\n",
    "import urllib.parse\n",
    "from sklearn.pipeline import Pipeline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def loadFile(name):\n",
    "    directory = str(os.getcwd())\n",
    "    filepath = os.path.join(directory, name)\n",
    "    with open(filepath,'r',encoding=\"utf8\") as f:\n",
    "        data = f.readlines()\n",
    "    data = list(set(data))\n",
    "    result = []\n",
    "    for d in data:\n",
    "        d = str(urllib.parse.unquote(d))   #converting url encoded data to simple string\n",
    "        result.append(d)\n",
    "    return result\n",
    "\n",
    "badQueries = loadFile('badqueries.txt')\n",
    "validQueries = loadFile('goodqueries.txt')\n",
    "\n",
    "badQueries = list(set(badQueries))\n",
    "validQueries = list(set(validQueries))\n",
    "allQueries = badQueries + validQueries\n",
    "yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean\n",
    "yGood = [0 for i in range(0, len(validQueries))]\n",
    "y = yBad + yGood\n",
    "queries = allQueries\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(queries, y, test_size=0.2, random_state=42) #splitting data\n",
    "badCount = len(badQueries)\n",
    "validCount = len(validQueries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bad samples: 44532\n",
      "Good samples: 1265974\n",
      "Baseline Constant negative: 0.966019\n",
      "------------\n",
      "Accuracy: 0.999447\n",
      "Precision: 0.985177\n",
      "Recall: 0.998520\n",
      "F1-Score: 0.991804\n",
      "AUC: 0.999841\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['lr.pkl']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipe_lr = Pipeline([('tfidf', TfidfVectorizer(min_df = 0.0, analyzer=\"char\", sublinear_tf=True, ngram_range=(1,3))),\n",
    "                    ('clf', LogisticRegression(class_weight={1: 2 * validCount / badCount, 0: 1.0}))\n",
    "                    ])\n",
    "pipe_lr.fit(X_train, y_train)\n",
    "\n",
    "predicted = pipe_lr.predict(X_test)\n",
    "predicted=list(predicted)\n",
    "fpr, tpr, _ = metrics.roc_curve(y_test, (pipe_lr.predict_proba(X_test)[:, 1]))\n",
    "auc = metrics.auc(fpr, tpr)\n",
    "\n",
    "print(\"Bad samples: %d\" % badCount)\n",
    "print(\"Good samples: %d\" % validCount)\n",
    "print(\"Baseline Constant negative: %.6f\" % (validCount / (validCount + badCount)))\n",
    "print(\"------------\")\n",
    "print(\"Accuracy: %f\" % pipe_lr.score(X_test, y_test))  #checking the accuracy\n",
    "print(\"Precision: %f\" % metrics.precision_score(y_test, predicted))\n",
    "print(\"Recall: %f\" % metrics.recall_score(y_test, predicted))\n",
    "print(\"F1-Score: %f\" % metrics.f1_score(y_test, predicted))\n",
    "print(\"AUC: %f\" % auc)\n",
    "joblib.dump(pipe_lr,\"lr.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
